1 Data pre-processing

This document is to analyse the Expression of the ubiquitin-related genes, immunoproteasome, HLA molecules, etc. and see if there is a pattern that distinguishes responders from non-responders.

### Load packages, functions and paths
source(file = "../01_install.R")
source(file = "../02_functions.R")
source(file = "../03_paths.R")

### Read data & wrangle
# Clincal metadata
clinical_data_path <- file.path(results_pc_path,
                           "clinical_metadata.csv")
clinical_df <- read.csv(clinical_data_path, header = TRUE)[ , -1]

# Save into RDS file
clinical_metadata_path <- file.path(results_expression_path, 
                      "clinical_metadata.rds")
saveRDS(clinical_df, file = clinical_metadata_path)

# TPM matrix
mm909_path <- file.path(data_path,
                        "MM909_data.RData")
load(mm909_path)
tpm_matrix <- tpm_ensg

# Get hugo_symbols
hugo_symbols_tpm <- rownames(tpm_matrix)

## Filtering out patients with missing data from rna_seq_raw and clinical_df
tpm_matrix_columns <- names(tpm_matrix)

# Subset clinical_df
clinical_df <- clinical_df %>% 
  dplyr::filter(Sample.ID %in% tpm_matrix_columns)

# Rename Sample.ID column
clinical_df <- clinical_df %>% 
  dplyr::rename(SampleID = Sample.ID,
                Response = Patient.Response,
                TMB = mut_load)

# Add hugo_symbols to df
tpm_matrix <- tpm_matrix %>% 
  dplyr::mutate(hgnc_symbol = hugo_symbols_tpm)
## Convert Hugo symbols to ensembl gene id

#1 Use grch37
ensembl_ids <- SYMBOLtoENSEMBL(hugo_symbols_tpm)

# Filter ensembl gene ids
ensembl_ids_tpm <- ensembl_ids[str_detect(ensembl_ids, "^ENSG")]
remaining_hugo <- ensembl_ids[!str_detect(ensembl_ids, "^ENSG")]

#2 Use grch38 for remaining
ensembl_ids <- SYMBOLtoENSEMBL38(remaining_hugo)

# Filter ensembl gene ids
ensembl_ids_tpm2 <- ensembl_ids[str_detect(ensembl_ids, "^ENSG")]
remaining_hugo <- ensembl_ids[!str_detect(ensembl_ids, "^ENSG")]

# (tried using biomaRt with grch37 but it couldn't map more hugo symbols to ensembl gene ids)

#3 Use biomaRt with grch38
# Connect to the GO BioMart database
ensembl = biomaRt::useMart(biomart="ENSEMBL_MART_ENSEMBL",
                  host="https://www.ensembl.org",
                  path="/biomart/martservice",
                  dataset="hsapiens_gene_ensembl")

# Retrieve hugo & Ensembl gene ids
hugo2ensembl_biomart_res <- biomaRt::getBM(attributes = c("external_gene_name",
                                                        "ensembl_gene_id"),
                   filters = "external_gene_name",
                   values = remaining_hugo,
                   mart = ensembl)

# Make sure 1 to 1 mapping
hugo2ensembl_biomart_df <- hugo2ensembl_biomart_res %>%
  dplyr::rename(hgnc_symbol = external_gene_name) %>% 
  dplyr::arrange(hgnc_symbol, ensembl_gene_id) %>%
  dplyr::group_by(hgnc_symbol) %>%
  dplyr::slice(1) %>%
  dplyr::ungroup()

#4 Merge results
# Convert character vectors to data.frames
df1 <- data.frame(ensembl_gene_id = ensembl_ids_tpm, 
                  hgnc_symbol = names(ensembl_ids_tpm), 
                  stringsAsFactors = FALSE)
df2 <- data.frame(ensembl_gene_id = ensembl_ids_tpm2, 
                  hgnc_symbol = names(ensembl_ids_tpm2), 
                  stringsAsFactors = FALSE)

# Combine all data.frames
combined_df <- rbind(df1, df2, hugo2ensembl_biomart_df)

# Ensure each hgnc_symbol maps to only one ensembl_gene_id
final_ensembl_tpm <- combined_df %>%
  dplyr::group_by(hgnc_symbol) %>%
  dplyr::summarise(ensembl_gene_id = dplyr::first(ensembl_gene_id)) %>%
  dplyr::ungroup()

# Find out missing hugo symbols & convert aliases
remaining_hugo <- setdiff(hugo_symbols_tpm, final_ensembl_tpm$hgnc_symbol)
corrected_remaining_hugo <- alias2SymbolTable(remaining_hugo, species = "Hs")

# Convert character vector to data.frame
df3 <- data.frame(hgnc_symbol = remaining_hugo,
                  corrected_hugo = corrected_remaining_hugo,
                  stringsAsFactors = FALSE)


# Use biomaRt again
hugo2ensembl_biomart_res <- biomaRt::getBM(attributes = c("external_gene_name",
                                                        "ensembl_gene_id"),
                   filters = "external_gene_name",
                   values = df3$corrected_hugo,
                   mart = ensembl)

# Make sure 1 to 1 mapping
hugo2ensembl_biomart_df <- hugo2ensembl_biomart_res %>%
  dplyr::rename(corrected_hugo = external_gene_name) %>% 
  dplyr::arrange(corrected_hugo, ensembl_gene_id) %>%
  dplyr::group_by(corrected_hugo) %>%
  dplyr::slice(1) %>%
  dplyr::ungroup()

# Merge original hugo symbol
df3 <- df3 %>% 
  dplyr::left_join(hugo2ensembl_biomart_df, by="corrected_hugo") %>% 
  dplyr::distinct()

remaining_hugo <- df3 %>% dplyr::filter(is.na(ensembl_gene_id))

# Manually annotate if I can
remaining_hugo <- remaining_hugo %>% 
  dplyr::mutate(
    ensembl_gene_id = case_when(
      hgnc_symbol == "DDIT4-AS1" ~ "ENSG00000269926",
      hgnc_symbol == "LINC01869" ~ "ENSG00000180279",
      hgnc_symbol == "LINC02310" ~ "ENSG00000258537",
      hgnc_symbol == "LINC02618" ~ "ENSG00000293386",
      hgnc_symbol == "LINC02677" ~ "ENSG00000242147"
    )
  )

# Merge
df3 <- df3 %>% 
  dplyr::left_join(remaining_hugo, by=c("hgnc_symbol", "corrected_hugo")) %>% 
  dplyr::mutate(ensembl_gene_id = coalesce(ensembl_gene_id.y, ensembl_gene_id.x)) %>% # Prefer 'remaining_hugo' values
  dplyr::select(-ensembl_gene_id.x, -ensembl_gene_id.y) %>% # Remove the temporary columns
  dplyr::distinct()
remaining_hugo <- df3 %>% dplyr::filter(is.na(ensembl_gene_id))
# 6 hugo symbols remaining that are deprecated

# Remove NAs
final_df3 <- df3 %>% 
  dplyr::filter(!is.na(ensembl_gene_id)) %>% 
  dplyr::select(-corrected_hugo)

# Merge with final df
final_ensembl_tpm <- rbind(final_ensembl_tpm, final_df3)

# Put ensembl gene id to tpm_matrix
final_tpm_matrix <- tpm_matrix %>% 
  dplyr::left_join(final_ensembl_tpm, by="hgnc_symbol") %>% 
  dplyr::filter(!is.na(ensembl_gene_id))

genes <- final_tpm_matrix$ensembl_gene_id
tpm_matrix <- final_tpm_matrix %>% 
  dplyr::select(-hgnc_symbol, -ensembl_gene_id) %>% 
  as.matrix()

rownames(tpm_matrix) <- genes
tpm_matrix_def <- removeDuplicateGenesDF(tpm_matrix)
# Merge in a list with names = (raw, norm, fpkm, tpm, Clinical)
input_list_expression <- list(
  "raw" = tpm_matrix_def, 
  "norm" = tpm_matrix_def, 
  "fpkm" = tpm_matrix_def, 
  "tpm" = tpm_matrix_def,
  "cpm" = tpm_matrix_def,
  "Clinical" = clinical_df)

# Save into RDS file
input_list_path <- file.path(results_expression_path, 
                      "input_list_expression.rds")
saveRDS(input_list_expression, file = input_list_path)

# Save TPM matrix only
input_tpm_path <- file.path(results_expression_path, 
                      "tpm_matrix.rds")
saveRDS(tpm_matrix_def, file = input_tpm_path)

2 Merge output of expression analysis with clinical and ubiquitin data (after running Aimilia’s pipeline)

biomarkers_out <- readRDS(file.path(data_path, "../MM909_MEL_TPM_biomarkers.rds"))

# Add clinical metadata & new response categories
biomarkers_clinical <- biomarkers_out %>% 
  dplyr::left_join(clinical_df, by="SampleID") %>% 
  dplyr::mutate(resp_3_cat = case_when(
    RECIST %in% c("CR", "PR") ~ "R", 
    RECIST %in% c("SD") ~ "SD", 
    RECIST %in% c("PD") ~ "NR", 
    TRUE ~ NA_character_
    ))

# Process ubi info and merge
annot_lof_ub_path <- file.path(results_ubfora_path, 
                      "annot_lof_ub.rds")
annot_lof_ub_df <- readRDS(annot_lof_ub_path)

cat_lof_ub_df <- annot_lof_ub_df %>% 
  dplyr::mutate(ub_category = case_when(
    category %in% c("E1_genes", "E2_genes", "E3_genes") ~ "ubiquitination",
    category %in% c("C19_genes", "dub_genes") ~ "deubiquitination",
    TRUE ~ NA_character_
  ))

# Save RDS
cat_lof_ub_path <- file.path(results_expression_path, 
                      "cat_lof_ub.rds")
saveRDS(cat_lof_ub_df, file = cat_lof_ub_path)

reduced_cat_lof_ub_df <- cat_lof_ub_df %>% 
  dplyr::select(sample_id, ub_category) %>% 
  dplyr::group_by(sample_id) %>%
  dplyr::summarise(
    ubi_lof = as.integer(any(ub_category == "ubiquitination")),
    deubi_lof = as.integer(any(ub_category == "deubiquitination"))
  )

# Add ubiquitin lof info
biomarkers_clinical_ubi <- biomarkers_clinical %>% 
  dplyr::left_join(reduced_cat_lof_ub_df, by=c("SampleID" = "sample_id")) %>% 
  tidyr::replace_na(list(ubi_lof = 0, deubi_lof = 0))

# Add ORA results
fora_results_lof_csv_path <- file.path(results_ubfora_path, "fora_results_ubiquitin_lof.csv")
fora_results_ubiquitin_lof_df <- read.csv(fora_results_lof_csv_path, header = TRUE)[ , -1]

fora_ubi_lof_clean_df <- fora_results_ubiquitin_lof_df %>%
  dplyr::rename(count = overlap) %>% 
  dplyr::select(sample_id, pathway, count, size) %>% 
  dplyr::mutate(gene_ratio = count / size) %>% 
  dplyr::filter(!sample_id %in% c("MM909_20", "MM909_24"))

temp_data_df <- fora_ubi_lof_clean_df %>% 
  dplyr::left_join(biomarkers_clinical_ubi %>% dplyr::select(SampleID, TumorPurity), 
                   by=c("sample_id" = "SampleID"))

# Response df
response_df <- biomarkers_clinical_ubi %>% 
  dplyr::rename(sample_id = SampleID) %>% 
  dplyr::select(sample_id, Response) %>% 
  dplyr::distinct()

# Summarize the ubi counts
summarized_data <- temp_data_df %>%
  dplyr::group_by(sample_id) %>%
  dplyr::summarize(
    ubi_counts = sum(count[pathway %in% c("Ubiquitin-protein ligase family", "Ubiquitin-activating enzymes", "Ubiquitin-conjugating enzymes")]),
    deubi_counts = sum(count[pathway %in% c("Deubiquitinases", "Peptidase family C19")])
  )

# Prepare input full df
biomarkers_clinical_ubi_counts <- biomarkers_clinical_ubi %>% 
  dplyr::rename(IMPRES = IMRES,
                IP_Wu = IP_score) %>% 
  dplyr::left_join(summarized_data, by=c("SampleID" = "sample_id"))

# Save into RDS file
full_df_path <- file.path(results_expression_path, 
                      "biomarkers_clinical_ubi_counts.rds")
saveRDS(biomarkers_clinical_ubi_counts, file = full_df_path)

2.1 Full output table

# Datatable of output
datatable(biomarkers_clinical_ubi_counts, 
          extensions = c('Buttons', 
                         'FixedColumns'), 
          options = list(
            dom = 'Bfrtip',
            buttons = c('copy', 'excel', 'csv'),
            scrollX=TRUE,
            pageLength=8,
            fixedColumns = list(leftColumns = 2), # Freeze the first 2 columns
            columnDefs = list(list(
              targets = "_all",
              render = JS(
                "function(data, type, row, meta) {",
                "  return data === null ? 'NA' : data;",
                "}"
              )
            ))
          ),
          caption = "Expression analysis of Cancer Immunity Cycle biomarkers merged with clinical metadata and LoF ubi mutations (binary and count)."
        )

The numeric biomarkers selected to evaluate Cancer Immunity Cycle are the following:

  • STEP1: Release_of_cancer_cell_antigens_S1_TIP
  • STEP2: Cancer_antigen_presentation_S2_TIP, APM_Senb, APMS_Thomson, IP_Kalaora, IP_Wu, MIAS
  • STEP3: Priming_and_activation_S3_TIP
  • STEP4: Trafficking_of_immune_cells_into_tumors_overall_S4_TIP
  • STEP5: Infiltration_of_immune_cells_into_tumors_S5_TIP, Cytotoxic_cells_cTME, Dendritic_cells_cTME, B_cells_cTME, T_cells_CD8_cTME, T_cells_CD4_cTME, T_regulatory_cells_cTME, Dysfunction, Exclusion
  • STEP6: Recognition_of_cancer_cells_by_T_cells_S6_TIP, IMPRES
  • STEP7: Killing_of_cancer_cells_S7_TIP, ImmuneCyt_Davoli, Tinflam_Ayers, Ecotype, TME

3 Boxplots

Here I compare APM_Senb with APMS_Thomson and IP_Kalaora with IP_Wu to decide only 1 of each pair since they are measuring the same and keeping both would be redundant.

# Loop through the biomarkers and generate a boxplot for each
plots <- list()
for (biomarker in c("APM_Senb", "APMS_Thomson", "IP_Kalaora", "IP_Wu")) {
  p <- create_biomarker_boxplot(biomarkers_clinical_ubi_counts, biomarker)
  plots[[biomarker]] <- p
}

# Print the plots
for (biomarker in c("APM_Senb", "APMS_Thomson", "IP_Kalaora", "IP_Wu")) {
  print(plots[[biomarker]])
}

Since none is statistically significant, I chose to keep APM_Senb because it is based on HLA genes and IP_Wu because it takes into account the 3 immunoproteasome subunits.

# Selection of numeric biomarkers
biomarkers_numeric_df <- biomarkers_clinical_ubi_counts %>% 
  dplyr::select(SampleID, Response, resp_3_cat, Release_of_cancer_cell_antigens_S1_TIP, Cancer_antigen_presentation_S2_TIP, APM_Senb, IP_Wu, MIAS, Priming_and_activation_S3_TIP, Trafficking_of_immune_cells_into_tumors_overall_S4_TIP, Infiltration_of_immune_cells_into_tumors_S5_TIP, Cytotoxic_cells_cTME, Dendritic_cells_cTME, B_cells_cTME, T_cells_CD8_cTME, T_cells_CD4_cTME, T_regulatory_cells_cTME, Dysfunction, Exclusion, Recognition_of_cancer_cells_by_T_cells_S6_TIP, IMPRES, Killing_of_cancer_cells_S7_TIP, ImmuneCyt_Davoli, Tinflam_Ayers, TumorPurity)
biomarkers_numeric_vec <- colnames(biomarkers_numeric_df)[-c(1, 2, 3)]

# Selection of categorical biomarkers
biomarkers_categorical_df <- biomarkers_clinical_ubi_counts %>% 
  dplyr::select(SampleID, Response, resp_3_cat, Ecotype, TME) %>% 
  dplyr::mutate(Response = as.factor(Response),
                resp_3_cat = as.factor(resp_3_cat),
                TME = as.factor(TME),
                Ecotype = as.factor(Ecotype))
biomarkers_categorical_vec <- colnames(biomarkers_categorical_df)[-c(1,2,3)]

# Selection of ubiquitin features
biomarkers_ubiquitin_df <- biomarkers_clinical_ubi_counts %>% 
  dplyr::select(SampleID, Response, resp_3_cat, ubi_lof, deubi_lof, ubi_counts, deubi_counts)
biomarkers_ubiquitin_vec <- colnames(biomarkers_categorical_df)[-c(1,2,3)]

# Individual genes pre-processing
hugo_ensembl_ub <- annot_lof_ub_df %>% 
  dplyr::select(corrected_hugo_symbol, ensembl_id) %>% 
  dplyr::distinct()

ensembl_ub_vec <- hugo_ensembl_ub %>% 
  dplyr::pull(ensembl_id)

## Ubiquitin-related genes with LoF in my data
hugo_ub_vec <- hugo_ensembl_ub %>% 
  dplyr::pull(corrected_hugo_symbol)

## Save into RDS file
ensembl_ub_path <- file.path(results_expression_path, 
                      "ensembl_ub_vector.rds")
saveRDS(ensembl_ub_vec, file = ensembl_ub_path)
hugo_ub_path <- file.path(results_expression_path, 
                      "hugo_ub_vector.rds")
saveRDS(hugo_ub_vec, file = hugo_ub_path)

## Proteasome subunit HGNC symbols
proteas_vec <- c("PSMB5", "PSMB6", "PSMB7", "PSMB8", "PSMB9", "PSMB10")

# Selection of ubiquitin genes
biomarkers_ubi_exp_df <- biomarkers_clinical_ubi_counts %>% 
  dplyr::select(SampleID, Response, resp_3_cat, one_of(hugo_ub_vec))
biomarkers_ubi_exp_vec <- colnames(biomarkers_ubi_exp_df)[-c(1,2,3)]
  
# Selection of proteasome genes
biomarkers_proteas_exp_df <- biomarkers_clinical_ubi_counts %>% 
  dplyr::select(SampleID, Response, resp_3_cat, one_of(proteas_vec))
biomarkers_proteas_exp_vec <- colnames(biomarkers_proteas_exp_df)[-c(1,2,3)]

4 Compute correlation matrix

# Fig15

# Filter the data
df2cor <- biomarkers_numeric_df %>% 
  dplyr::select(-SampleID, -Response, -resp_3_cat)

# Compute the correlation matrix
cor_matrix <- cor(df2cor, use = "pairwise.complete.obs", method = "pearson")

# Compute Euclidean distance from the correlation matrix
dist_matrix <- dist(cor_matrix, method = "euclidean")

# Perform hierarchical clustering
hc <- hclust(dist_matrix, method = "complete")

# Order the correlation matrix according to the clustering
ordered_cor_matrix <- cor_matrix[hc$order, hc$order]

# Visualize the correlation matrix with clustering and reduced text size
plot_path <- file.path(figures_expression_path, "biomarkers_correlation.png")
png(plot_path, width = 1000, height = 800)
corrplot(ordered_cor_matrix, method = "color", type = "upper", order = "original",
         tl.col = "black", tl.srt = 45, addCoef.col = "black", number.cex = 0.65, tl.cex = 0.8) # With numbers
# corrplot(ordered_cor_matrix, method = "color", type = "upper", order = "original",
#          tl.col = "black", tl.srt = 45) # Without numbers

# Add title to the plot
title(main = NULL, cex.main = 1.5)

dev.off()
## quartz_off_screen 
##                 2
knitr::include_graphics(plot_path)

5 Tile plot biomarkers + ubi info

Selected pheatmap_biomarkers_split_response_ubi_binary for thesis, but can uncomment to see other options

# Fig17

# # pheatmap_biomarkers_both_response_ubi_binary
# p1 <- pheatmap_biomarkers(biomarkers_clinical_ubi_counts, biomarkers_numeric_vec, biomarkers_categorical_vec, response_col = "Response", ubi_counts = FALSE, split_response = FALSE)
# 
# # Save the plot using the png function
# plot_path <- file.path(figures_expression_path, "pheatmap_biomarkers_both_response_ubi_binary.png")
# png(filename = plot_path, width = 8, height = 6, units = "in", res = 300)
# print(p1)
# dev.off()

# pheatmap_biomarkers_split_response_ubi_binary
p2 <- pheatmap_biomarkers(biomarkers_clinical_ubi_counts, biomarkers_numeric_vec, biomarkers_categorical_vec, response_col = "Response", ubi_counts = FALSE, split_response = TRUE)

print(p2)

# Save the ggplot
plot_path <- file.path(figures_expression_path,
                      "pheatmap_biomarkers_split_response_ubi_binary.png")
ggsave(plot_path, p2, width = 24, height = 10, dpi = 300)

# # pheatmap_biomarkers_both_response_ubi_counts
# p3 <- pheatmap_biomarkers(biomarkers_clinical_ubi_counts, biomarkers_numeric_vec, biomarkers_categorical_vec, response_col = "Response", ubi_counts = TRUE, split_response = FALSE)
# 
# # Save the plot using the png function
# plot_path <- file.path(figures_expression_path, "pheatmap_biomarkers_both_response_ubi_counts.png")
# png(filename = plot_path, width = 10, height = 6, units = "in", res = 300)
# print(p3)
# dev.off()
# 
# # pheatmap_biomarkers_split_response_ubi_counts
# p4 <- pheatmap_biomarkers(biomarkers_clinical_ubi_counts, biomarkers_numeric_vec, biomarkers_categorical_vec, response_col = "Response", ubi_counts = TRUE, split_response = TRUE)
# 
# # Save the ggplot
# plot_path <- file.path(figures_expression_path,
#                       "pheatmap_biomarkers_split_response_ubi_counts.png")
# ggsave(plot_path, p4, width = 16, height = 10, dpi = 300)
# 
# # pheatmap_biomarkers_both_response_ubi_binary
# p5 <- pheatmap_biomarkers(biomarkers_clinical_ubi_counts, biomarkers_numeric_vec, biomarkers_categorical_vec, response_col = "resp_3_cat", ubi_counts = FALSE, split_response = FALSE)
# 
# # Save the plot using the png function
# plot_path <- file.path(figures_expression_path, "pheatmap_biomarkers_three_response_ubi_binary.png")
# png(filename = plot_path, width = 8, height = 6, units = "in", res = 300)
# print(p5)
# dev.off()
# 
# # pheatmap_biomarkers_split_response_ubi_binary
# p6 <- pheatmap_biomarkers(biomarkers_clinical_ubi_counts, biomarkers_numeric_vec, biomarkers_categorical_vec, response_col = "resp_3_cat", ubi_counts = FALSE, split_response = TRUE)
# 
# # Save the ggplot
# plot_path <- file.path(figures_expression_path,
#                       "pheatmap_biomarkers_three_split_response_ubi_binary.png")
# ggsave(plot_path, p6, width = 24, height = 10, dpi = 300)
# 
# # pheatmap_biomarkers_both_response_ubi_counts
# p7 <- pheatmap_biomarkers(biomarkers_clinical_ubi_counts, biomarkers_numeric_vec, biomarkers_categorical_vec, response_col = "resp_3_cat", ubi_counts = TRUE, split_response = FALSE)
# 
# # Save the plot using the png function
# plot_path <- file.path(figures_expression_path, "pheatmap_biomarkers_three_response_ubi_counts.png")
# png(filename = plot_path, width = 10, height = 6, units = "in", res = 300)
# print(p7)
# dev.off()
# 
# # pheatmap_biomarkers_split_response_ubi_counts
# p8 <- pheatmap_biomarkers(biomarkers_clinical_ubi_counts, biomarkers_numeric_vec, biomarkers_categorical_vec, response_col = "resp_3_cat", ubi_counts = TRUE, split_response = TRUE)
# 
# # Save the ggplot
# plot_path <- file.path(figures_expression_path,
#                       "pheatmap_biomarkers_three_split_response_ubi_counts.png")
# ggsave(plot_path, p8, width = 24, height = 10, dpi = 300)

5.1 Ubiquitin LoF and expression plot

# Fig14

# Normalize biomarkers expression
ub_expression <- biomarkers_ubi_exp_df
ub_expression_vec <- biomarkers_ubi_exp_vec

## Select all columns except the first two for normalization
columns_to_normalize <- colnames(ub_expression)[-(1:3)]

## Perform min-max normalization on the selected columns
ub_expression[columns_to_normalize] <- lapply(ub_expression[columns_to_normalize], function(x) (x - min(x)) / (max(x) - min(x)))

# Pivot long
ub_expression_long <- ub_expression %>% 
  tidyr::pivot_longer(cols = -c(SampleID, Response, resp_3_cat),
                      names_to = "corrected_hugo_symbol",
                      values_to = "expression")

gene_ub_cat <- cat_lof_ub_df %>% 
  dplyr::select(corrected_hugo_symbol, category, ub_category) %>% 
  dplyr::distinct()

correct_hugo <- cat_lof_ub_df %>% 
  dplyr::select(hugo_symbol, corrected_hugo_symbol) %>% 
  dplyr::distinct()

# Identify genes with both "C19_genes" and another category within "deubiquitination"
genes_with_multiple_categories <- gene_ub_cat %>%
  dplyr::filter(ub_category == "deubiquitination") %>%
  group_by(corrected_hugo_symbol) %>%
  dplyr::filter(n_distinct(category) > 1) %>%
  pull(corrected_hugo_symbol) %>%
  unique()

# Filter the data frame to remove "C19_genes" for those genes only
gene_ub_cat <- gene_ub_cat %>%
  dplyr::filter(!(corrected_hugo_symbol %in% genes_with_multiple_categories & ub_category == "deubiquitination" & category == "C19_genes"))

ub_lof_expression <- ub_expression_long %>% 
  dplyr::left_join(gene_ub_cat, by="corrected_hugo_symbol")

lof_ub_binary_path <- file.path(results_ubi_db_path, 
                      "lof_ub_binary.rds")
lof_ub_binary <- readRDS(lof_ub_binary_path)

lof_ub_binary_filtered <- lof_ub_binary %>% 
  dplyr::filter(rowSums(dplyr::select(., -ensembl_id, -hugo_symbol)) != 0) %>% 
  dplyr::left_join(correct_hugo, by="hugo_symbol") %>% 
  dplyr::select(-ensembl_id, -hugo_symbol) %>% 
  dplyr::select(corrected_hugo_symbol, everything())

# Pivot lof_ub_binary_filtered to long format
lof_ub_long <- lof_ub_binary_filtered %>%
  pivot_longer(cols = -corrected_hugo_symbol, 
               names_to = "SampleID", 
               values_to = "ubi_lof")

# Join the data frames
ub_lof_expression_presence <- ub_lof_expression %>%
  dplyr::left_join(lof_ub_long, by = c("SampleID", "corrected_hugo_symbol")) %>% 
  dplyr::mutate(ubi_lof = as.character(ubi_lof)) %>%
  dplyr::mutate(ubi_lof = ifelse(is.na(ubi_lof), "0", ubi_lof))

# Calculate the mean expression for each SampleID
median_expression <- ub_lof_expression_presence %>%
  group_by(SampleID) %>%
  summarise(median_expression = median(expression, na.rm = TRUE)) %>%
  arrange(desc(median_expression))

# Reorder SampleID based on mean expression
ub_lof_expression_presence$SampleID <- factor(ub_lof_expression_presence$SampleID, levels = median_expression$SampleID)

# Response plot
response_plot <- ggplot(ub_lof_expression_presence, aes(x = SampleID, y = "1", fill = Response)) +
  geom_tile(color = "black", linewidth = 0.5) +
  scale_fill_manual(values = c("R" = "#009E73", "NR" = "#D55E00"),
                     labels = c("R" = "Responders", "NR" = "Non-responders")) +
  labs(x = NULL, 
       y = NULL,
       fill = "Patient Response",
       title = NULL) +
  theme_minimal() +
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        axis.text.y = element_blank(),
        axis.ticks.y = element_blank(),
        legend.position = "right",
        legend.spacing.y = unit(0.1, 'cm'),
        legend.background = element_rect(fill = "white", color = NA),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_rect(fill = "white", color = NA),
        plot.background = element_rect(fill = "white", color = NA),
        plot.margin = margin(0, 0, 0, 0, "pt"))

# Plot using gradient for expression
plot <- ggplot(ub_lof_expression_presence, aes(x = SampleID, y = corrected_hugo_symbol, fill = expression)) +
  geom_tile(color = "black", linewidth = 0.5, na.rm = FALSE) +
  scale_fill_gradient2(low = "#F0E442", mid = "#009E73", high = "#0072B2", midpoint = 0.5, na.value = "#555555") +
  geom_point(data = ub_lof_expression_presence %>% dplyr::filter(ubi_lof == "1"),
             aes(shape = "Present"), color = "#CC79A7", size = 3, show.legend = TRUE) +
  geom_point(data = ub_lof_expression_presence, aes(size = "NA"), shape = NA, color = "#555555")+
  guides(size=guide_legend("Missing expression", override.aes=list(shape=15, size = 6))) +
  labs(x = "Sample ID", y = "Genes with LoF mutation", fill = "Expression Level", shape = "LoF mutation") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        axis.ticks.x = element_blank(),
        axis.ticks.y = element_blank(),
        legend.position = "right",
        legend.spacing.y = unit(0.1, 'cm'),
        legend.background = element_rect(fill = "white", color = NA),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_rect(fill = "white", color = NA),
        plot.background = element_rect(fill = "white", colour = NA),
        plot.margin = margin(0, 0, 0, 0, "pt"))

# Extract legends
legend0 <- get_legend(response_plot)
## Warning in get_plot_component(plot, "guide-box"): Multiple components found;
## returning the first one. To return all, use `return_all = TRUE`.
legend1 <- get_legend(plot)
## Warning: Using size for a discrete variable is not advised.
## Warning: Removed 2222 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning in get_plot_component(plot, "guide-box"): Multiple components found;
## returning the first one. To return all, use `return_all = TRUE`.
# Combine plots without legends
combined_plot <- response_plot + plot + plot_layout(nrow = 2, ncol = 1, heights = c(0.25, 14)) & theme(legend.position = "none")

# Combine legends
combined_legends <- plot_grid(as_ggplot(legend0) + theme(plot.margin = unit(c(0, 0, 0, 0), "pt")),
                              as_ggplot(legend1) + theme(plot.margin = unit(c(20, 0, 0, 0), "pt")),
                              ncol = 2, align = "v", rel_widths = c(1, 1))

# Combine plots and legends
final_plot <- plot_grid(combined_plot, combined_legends, ncol = 2, rel_widths = c(4, 2), rel_heights = c(20, 1)) +
  theme(plot.background = element_rect(fill = "white", color = NA))
## Warning: Using size for a discrete variable is not advised.
## Warning: Removed 2222 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Print the final plot
print(final_plot)

# Save the ggplot
plot_path <- file.path(figures_expression_path,
                      "ubiquitin_lof_presence_and_expression.png")
ggsave(plot_path, final_plot, width = 10, height = 14, dpi = 300)

6 Explore individual gene expression profiles

6.1 Ubiquitin gene expression table

datatable(biomarkers_ubi_exp_df, 
          extensions = c('Buttons', 
                         'FixedColumns'), 
          options = list(
            dom = 'Bfrtip',
            buttons = c('copy', 'excel', 'csv'),
            scrollX=TRUE,
            pageLength=8,
            fixedColumns = list(leftColumns = 2), # Freeze the first 2 columns
            columnDefs = list(list(
              targets = "_all",
              render = JS(
                "function(data, type, row, meta) {",
                "  return data === null ? 'NA' : data;",
                "}"
              )
            ))
          ),
          caption = "Expression analysis of Ubiquitin-related genes with LoF in samples."
        )

6.2 Proteasome subunit expression table

datatable(biomarkers_proteas_exp_df, 
          extensions = c('Buttons', 
                         'FixedColumns'), 
          options = list(
            dom = 'Bfrtip',
            buttons = c('copy', 'excel', 'csv'),
            scrollX=TRUE,
            pageLength=8,
            fixedColumns = list(leftColumns = 2), # Freeze the first 2 columns
            columnDefs = list(list(
              targets = "_all",
              render = JS(
                "function(data, type, row, meta) {",
                "  return data === null ? 'NA' : data;",
                "}"
              )
            ))
          ),
          caption = "Expression analysis of Proteasome subunits."
        )

6.3 Boxplots of proteasome genes

#Fig13

proteas_boxplot_path <- file.path(figures_expression_path, "proteasome_boxplot.png")
proteas_boxplot <- create_proteas_boxplot(biomarkers_proteas_exp_df, biomarkers_proteas_exp_vec)

ggsave(proteas_boxplot_path, proteas_boxplot, width = 12, height = 8, dpi = 300)

print(proteas_boxplot)

7 Check TumorPurity

# Loop through the biomarkers and generate a boxplot for each
plots <- list()
for (biomarker in c("TumorPurity")) {
  p <- create_biomarker_boxplot(biomarkers_clinical_ubi_counts, biomarker)
  plots[[biomarker]] <- p
}

# Print the plots
for (biomarker in c("TumorPurity")) {
  print(plots[[biomarker]])
}

7.1 Number of LoF mutations in ubi vs TumorPurity

# Prepare data

# Count the number of mutations per sample_id
mutations_per_sample <- cat_lof_ub_df %>%
  dplyr::group_by(sample_id) %>%
  dplyr::summarise(mutation_count = n(), .groups = 'drop') %>% 
  dplyr::left_join(biomarkers_clinical_ubi_counts %>% dplyr::select(SampleID, TumorPurity), 
                   by=c("sample_id" = "SampleID"))

# Add samples with no ubi LoF
remaining_samples <- biomarkers_clinical_ubi_counts %>% 
  dplyr::select(SampleID, TumorPurity) %>%
  dplyr::filter(!(SampleID %in% mutations_per_sample$sample_id)) %>%
  dplyr::mutate(mutation_count = 0) %>% 
  dplyr::rename(sample_id = SampleID)

# Combine both data frames
complete_samples <- bind_rows(mutations_per_sample, remaining_samples)

# Response_df
response_df <- biomarkers_clinical_ubi_counts %>% 
  dplyr::rename(sample_id = SampleID) %>% 
  dplyr::select(sample_id, Response) %>% 
  dplyr::distinct()

prepared_data_df <- mutations_per_sample %>% 
  dplyr::left_join(response_df, by="sample_id") %>% 
  dplyr::mutate(short_sample_id = sub(".*_(\\d+)$", "\\1", sample_id))

7.2 Linear regression plot (TumorPurity & Ubi LoF count)

# Fig16

# Calculate regression statistics
regression_model <- lm(mutation_count ~ TumorPurity + Response, data = prepared_data_df)
regression_summary <- summary(regression_model)

# Extract the p-value for Intercept
p_value <- broom::tidy(regression_model) %>%
  dplyr::filter(term == "(Intercept)") %>%
  pull(p.value)

# Create scatter plot with regression line
scatter_plot <- ggplot(prepared_data_df, aes(x = TumorPurity, y = mutation_count, color = Response)) +
  geom_point(size = 3) +
  geom_label_repel(aes(label = short_sample_id), size = 3, color = "black", 
                     max.overlaps = 50, max.time = 2, alpha = 0.5, fill = "white") +
  scale_color_manual(values = c("R" = "#009E73", "NR" = "#D55E00"),
                     labels = c("R" = "Responders", "NR" = "Non-responders")) +
  geom_smooth(method = "lm", se = TRUE, color = "black", linetype = "dashed") +
  labs(title = NULL,
       x = "Tumor Purity",
       y = "nº of LoF mutations",
       color = "Patient Response") +
  scale_y_continuous(breaks = seq(0, 25, by = 5),
                     minor_breaks = seq(0, 25, by = 5)) +
  scale_x_continuous(breaks = seq(0, 1, by = 0.1),
                     minor_breaks = seq(0, 1, by = 0.1)) +
  coord_cartesian(ylim = c(0, NA)) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5),
        panel.grid.minor = element_blank(),
        panel.background = element_rect(fill = "white", colour = "black"),
        plot.background = element_rect(fill = "white", colour = NA)) +
  annotate("text", x = Inf, y = Inf, label = paste("p-value:", format(p_value, digits = 3)),
           hjust = 1.5, vjust = 18, size = 4, color = "black")

# ggsave
ggsave(file.path(figures_expression_path, "regression_tp_ubi.png"), scatter_plot, width = 9, height = 5, dpi = 300)
## `geom_smooth()` using formula = 'y ~ x'
# Print the scatter plot
print(scatter_plot)
## `geom_smooth()` using formula = 'y ~ x'

Looks like it is not correlated significatively (p>0.05).

8 Session Info

sessionInfo()
## R version 4.3.2 (2023-10-31)
## Platform: x86_64-apple-darwin20 (64-bit)
## Running under: macOS Monterey 12.7.4
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.3-x86_64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-x86_64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: Europe/Copenhagen
## tzcode source: internal
## 
## attached base packages:
## [1] stats4    grid      stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] tidyHeatmap_1.8.1           ComplexHeatmap_2.18.0      
##  [3] GO.db_3.18.0                mygene_1.38.0              
##  [5] ggupset_0.3.0               clusterProfiler_4.10.1     
##  [7] limma_3.58.1                fgsea_1.28.0               
##  [9] EnsDb.Hsapiens.v86_2.99.0   EnsDb.Hsapiens.v75_2.99.0  
## [11] ensembldb_2.26.0            AnnotationFilter_1.26.0    
## [13] GenomicFeatures_1.54.4      GSEABase_1.64.0            
## [15] graph_1.80.0                annotate_1.80.0            
## [17] XML_3.99-0.16.1             AnnotationDbi_1.64.1       
## [19] maftools_2.18.0             biomaRt_2.58.2             
## [21] VariantAnnotation_1.48.1    Rsamtools_2.18.0           
## [23] Biostrings_2.70.3           XVector_0.42.0             
## [25] SummarizedExperiment_1.32.0 Biobase_2.62.0             
## [27] GenomicRanges_1.54.1        GenomeInfoDb_1.38.8        
## [29] IRanges_2.36.0              S4Vectors_0.40.2           
## [31] MatrixGenerics_1.14.0       matrixStats_1.3.0          
## [33] BiocGenerics_0.48.1         magrittr_2.0.3             
## [35] dendextend_1.17.1           corrplot_0.92              
## [37] broom_1.0.5                 ggrepel_0.9.5              
## [39] ggplotify_0.1.2             pheatmap_1.0.12            
## [41] cowplot_1.1.3               ggnewscale_0.4.10          
## [43] reshape2_1.4.4              ggdendro_0.2.0             
## [45] ggpubr_0.6.0                patchwork_1.2.0            
## [47] gridtext_0.1.5              circlize_0.4.16            
## [49] VennDiagram_1.7.3           futile.logger_1.4.3        
## [51] HGNChelper_0.8.1            msigdbr_7.5.1              
## [53] jsonlite_1.8.8              httr_1.4.7                 
## [55] DT_0.33                     knitr_1.45                 
## [57] lubridate_1.9.3             forcats_1.0.0              
## [59] stringr_1.5.1               dplyr_1.1.4                
## [61] purrr_1.0.2                 readr_2.1.5                
## [63] tidyr_1.3.1                 tibble_3.2.1               
## [65] ggplot2_3.5.1               tidyverse_2.0.0            
## [67] readxl_1.4.3                rprojroot_2.0.4            
## 
## loaded via a namespace (and not attached):
##   [1] progress_1.2.3              nnet_7.3-19                
##   [3] HDF5Array_1.30.1            vctrs_0.6.5                
##   [5] digest_0.6.35               png_0.1-8                  
##   [7] shape_1.4.6.1               proxy_0.4-27               
##   [9] MASS_7.3-60.0.1             foreach_1.5.2              
##  [11] qvalue_2.34.0               withr_3.0.0                
##  [13] xfun_0.44                   ggfun_0.1.4                
##  [15] survival_3.6-4              memoise_2.0.1              
##  [17] gson_0.1.0                  systemfonts_1.1.0          
##  [19] ragg_1.3.2                  tidytree_0.4.6             
##  [21] zoo_1.8-12                  GlobalOptions_0.1.2        
##  [23] DNAcopy_1.76.0              Formula_1.2-5              
##  [25] prettyunits_1.2.0           KEGGREST_1.42.0            
##  [27] rstatix_0.7.2               restfulr_0.0.15            
##  [29] rhdf5filters_1.14.1         rhdf5_2.46.1               
##  [31] rstudioapi_0.16.0           generics_0.1.3             
##  [33] DOSE_3.28.2                 base64enc_0.1-3            
##  [35] babelgene_22.9              curl_5.2.1                 
##  [37] zlibbioc_1.48.2             ScaledMatrix_1.10.0        
##  [39] ggraph_2.2.1                polyclip_1.10-6            
##  [41] GenomeInfoDbData_1.2.11     quadprog_1.5-8             
##  [43] SparseArray_1.2.4           xtable_1.8-4               
##  [45] doParallel_1.0.17           evaluate_0.23              
##  [47] S4Arrays_1.2.1              BiocFileCache_2.10.2       
##  [49] preprocessCore_1.64.0       hms_1.1.3                  
##  [51] glmnet_4.1-8                irlba_2.3.5.1              
##  [53] colorspace_2.1-0            filelock_1.0.3             
##  [55] viridis_0.6.5               ggtree_3.10.1              
##  [57] limSolve_1.5.7.1            lattice_0.22-6             
##  [59] shadowtext_0.1.3            class_7.3-22               
##  [61] Hmisc_5.1-2                 pillar_1.9.0               
##  [63] nlme_3.1-164                iterators_1.0.14           
##  [65] beachmat_2.18.1             compiler_4.3.2             
##  [67] stringi_1.8.4               GenomicAlignments_1.38.2   
##  [69] plyr_1.8.9                  crayon_1.5.2               
##  [71] abind_1.4-5                 BiocIO_1.12.0              
##  [73] gridGraphics_0.5-1          chron_2.3-61               
##  [75] locfit_1.5-9.9              graphlayouts_1.1.1         
##  [77] org.Hs.eg.db_3.18.0         bit_4.0.5                  
##  [79] fastmatch_1.1-4             textshaping_0.3.7          
##  [81] codetools_0.2-20            BiocSingular_1.18.0        
##  [83] crosstalk_1.2.1             bslib_0.7.0                
##  [85] e1071_1.7-14                GetoptLong_1.0.5           
##  [87] splines_4.3.2               Rcpp_1.0.12                
##  [89] dbplyr_2.5.0                sparseMatrixStats_1.14.0   
##  [91] HDO.db_0.99.1               cellranger_1.1.0           
##  [93] blob_1.2.4                  utf8_1.2.4                 
##  [95] clue_0.3-65                 fs_1.6.4                   
##  [97] checkmate_2.3.1             DelayedMatrixStats_1.24.0  
##  [99] GSVA_1.50.5                 ggsignif_0.6.4             
## [101] sqldf_0.4-11                Matrix_1.6-5               
## [103] statmod_1.5.0               tzdb_0.4.0                 
## [105] lpSolve_5.6.20              tweenr_2.0.3               
## [107] pkgconfig_2.0.3             tools_4.3.2                
## [109] cachem_1.0.8                RSQLite_2.3.6              
## [111] viridisLite_0.4.2           DBI_1.2.2                  
## [113] fastmap_1.2.0               rmarkdown_2.26             
## [115] scales_1.3.0                sass_0.4.9                 
## [117] BiocManager_1.30.23         carData_3.0-5              
## [119] rpart_4.1.23                farver_2.1.2               
## [121] mgcv_1.9-1                  survminer_0.4.9            
## [123] tidygraph_1.3.1             scatterpie_0.2.2           
## [125] gsubfn_0.7                  yaml_2.3.8                 
## [127] foreign_0.8-86              rtracklayer_1.62.0         
## [129] cli_3.6.2                   lifecycle_1.0.4            
## [131] lambda.r_1.2.4              backports_1.4.1            
## [133] BiocParallel_1.36.0         timechange_0.3.0           
## [135] gtable_0.3.5                rjson_0.2.21               
## [137] parallel_4.3.2              ape_5.8                    
## [139] bitops_1.0-7                bit64_4.0.5                
## [141] yulab.utils_0.1.4           proto_1.0.0                
## [143] futile.options_1.0.1        highr_0.10                 
## [145] jquerylib_0.1.4             GOSemSim_2.28.1            
## [147] survMisc_0.5.6              lazyeval_0.2.2             
## [149] htmltools_0.5.8.1           enrichplot_1.22.0          
## [151] KMsurv_0.1-5                rappdirs_0.3.3             
## [153] formatR_1.14                glue_1.7.0                 
## [155] RCurl_1.98-1.14             treeio_1.26.0              
## [157] BSgenome_1.70.2             gridExtra_2.3              
## [159] igraph_2.0.3                R6_2.5.1                   
## [161] DESeq2_1.42.1               SingleCellExperiment_1.24.0
## [163] labeling_0.4.3              km.ci_0.5-6                
## [165] cluster_2.1.6               Rhdf5lib_1.24.2            
## [167] aplot_0.2.2                 DelayedArray_0.28.0        
## [169] tidyselect_1.2.1            ProtGenerics_1.34.0        
## [171] htmlTable_2.4.2             ggforce_0.4.2              
## [173] xml2_1.3.6                  car_3.1-2                  
## [175] rsvd_1.0.5                  munsell_0.5.1              
## [177] data.table_1.15.4           htmlwidgets_1.6.4          
## [179] RColorBrewer_1.1-3          rlang_1.1.3                
## [181] IOBR_0.99.8                 fansi_1.0.6